Explore the Lifebook Protection Data

Author

Mick Cooney

Published

August 19, 2025

This workbook was created using the ‘dataexpks’ template:

https://github.com/DublinLearningGroup/dataexpks

1 Introduction

This workbook performs the basic data exploration of the dataset.

Show code
# Discrete columns with at least this many distinct values are treated as
# high-count categoricals and dropped from the exploration further down.
dataexp_level_exclusion_threshold <- 100

# Number of levels kept when categorical variables are lumped for plotting.
dataexp_cat_level_count <- 40
# Number of bins used by the histograms in this workbook.
dataexp_hist_bins_count <- 50

2 Load Data

First we load the dataset.

Show code
# Read the policy book from its parquet file and show the column structure.
rawdata_tbl <- "data/lifeins_policybook_inoutforce.parquet" |>
  read_parquet()

rawdata_tbl |> glimpse()
Rows: 1,500,000
Columns: 25
$ policy_id               <chr> "C010000009", "C010000019", "C010000032", "C01…
$ countyname              <chr> "Dublin City", "Kilkenny County", "South Dubli…
$ edname                  <chr> "North Dock B", "Kilkenny Rural", "Clondalkin-…
$ nuts3name               <chr> "Dublin", "South-East (IE)", "Dublin", "West",…
$ sa_id                   <chr> "A268108011", "A097063020", "A267050012", "A06…
$ cluster_id              <chr> "n6_c0", "n6_c0", "n6_c4", "n6_c1", "n6_c5", "…
$ prod_type               <fct> protection, protection, pension, pension, prot…
$ prem_type               <chr> "RP", "RP", "SP", "RP", "RP", "RP", "RP", "RP"…
$ prem_freq               <chr> "12", "12", NA, "12", "12", "12", "12", "12", …
$ prem_ape                <dbl> 4172.34, 1150.17, 600.00, 3552.98, 313.05, 731…
$ prem_risk               <dbl> 2980.2396, 821.5525, NA, NA, 223.6095, 522.815…
$ policy_startdate        <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-0…
$ policy_enddate          <date> 2010-01-02, 2000-01-02, 2067-06-13, 2091-01-1…
$ policy_duration         <int> 20, 10, NA, NA, 20, 15, 20, 15, 15, 20, 20, 5,…
$ mort_rating             <dbl> 200, 100, NA, NA, 200, 100, NA, 200, 100, 100,…
$ sum_assured             <dbl> 150000, 450000, NA, NA, 250000, 450000, NA, 50…
$ dob_life1               <date> 1937-07-22, 1964-08-19, 1947-06-13, 1971-01-1…
$ gender_life1            <chr> "F", "M", "F", "M", "F", "F", "M", "M", "M", "…
$ smoker_life1            <chr> "S", "N", "N", "N", "S", "N", "S", "S", "N", "…
$ isjointlife             <lgl> TRUE, TRUE, NA, NA, FALSE, TRUE, NA, FALSE, FA…
$ islifeonly              <lgl> TRUE, FALSE, NA, NA, TRUE, TRUE, NA, TRUE, TRU…
$ mortgage_status         <chr> "MORTDECR", "MORTDECR", NA, NA, "MORTDECR", "T…
$ policy_status           <chr> "lapsed", "lapsed", "lapsed", "lapsed", "lapse…
$ policy_statuschangedate <date> 1998-10-02, 1993-08-02, 1998-04-02, 1996-09-0…
$ lapsed                  <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…

2.1 Perform Quick Data Cleaning

Show code
### _TEMPLATE_
### Do simple datatype transforms and save output in data_tbl
# The only transform applied here is converting column names to snake_case.
data_tbl <- rawdata_tbl
colnames(data_tbl) <- to_snake_case(colnames(data_tbl))

data_tbl |> glimpse()
Rows: 1,500,000
Columns: 25
$ policy_id               <chr> "C010000009", "C010000019", "C010000032", "C01…
$ countyname              <chr> "Dublin City", "Kilkenny County", "South Dubli…
$ edname                  <chr> "North Dock B", "Kilkenny Rural", "Clondalkin-…
$ nuts_3_name             <chr> "Dublin", "South-East (IE)", "Dublin", "West",…
$ sa_id                   <chr> "A268108011", "A097063020", "A267050012", "A06…
$ cluster_id              <chr> "n6_c0", "n6_c0", "n6_c4", "n6_c1", "n6_c5", "…
$ prod_type               <fct> protection, protection, pension, pension, prot…
$ prem_type               <chr> "RP", "RP", "SP", "RP", "RP", "RP", "RP", "RP"…
$ prem_freq               <chr> "12", "12", NA, "12", "12", "12", "12", "12", …
$ prem_ape                <dbl> 4172.34, 1150.17, 600.00, 3552.98, 313.05, 731…
$ prem_risk               <dbl> 2980.2396, 821.5525, NA, NA, 223.6095, 522.815…
$ policy_startdate        <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-0…
$ policy_enddate          <date> 2010-01-02, 2000-01-02, 2067-06-13, 2091-01-1…
$ policy_duration         <int> 20, 10, NA, NA, 20, 15, 20, 15, 15, 20, 20, 5,…
$ mort_rating             <dbl> 200, 100, NA, NA, 200, 100, NA, 200, 100, 100,…
$ sum_assured             <dbl> 150000, 450000, NA, NA, 250000, 450000, NA, 50…
$ dob_life_1              <date> 1937-07-22, 1964-08-19, 1947-06-13, 1971-01-1…
$ gender_life_1           <chr> "F", "M", "F", "M", "F", "F", "M", "M", "M", "…
$ smoker_life_1           <chr> "S", "N", "N", "N", "S", "N", "S", "S", "N", "…
$ isjointlife             <lgl> TRUE, TRUE, NA, NA, FALSE, TRUE, NA, FALSE, FA…
$ islifeonly              <lgl> TRUE, FALSE, NA, NA, TRUE, TRUE, NA, TRUE, TRU…
$ mortgage_status         <chr> "MORTDECR", "MORTDECR", NA, NA, "MORTDECR", "T…
$ policy_status           <chr> "lapsed", "lapsed", "lapsed", "lapsed", "lapse…
$ policy_statuschangedate <date> 1998-10-02, 1993-08-02, 1998-04-02, 1996-09-0…
$ lapsed                  <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…

2.2 Create Derived Variables

We now create derived features useful for modelling. These values are new variables calculated from existing variables in the data.

Show code
# Keep protection policies only and derive lifetime features measured in weeks.
# NOTE(review): this pipeline starts again from rawdata_tbl, so the snake_case
# column renaming done in the cleaning step is not carried forward. Confirm
# that this is intended.
data_tbl <- rawdata_tbl |>
  filter(prod_type == "protection") |>
  mutate(
    # Weeks between policy start and the fixed reference date 2016-01-01.
    weeks_to_now = as.numeric(
      difftime(as.Date("2016-01-01"), policy_startdate, units = "weeks")
      ),
    # Weeks between policy start and the recorded status-change date.
    weeks_to_status = as.numeric(
      difftime(policy_statuschangedate, policy_startdate, units = "weeks")
      ),

    # In-force policies run up to the reference date; all other statuses run
    # up to their status-change date.
    policy_lifetime = if_else(
      policy_status == "inforce", weeks_to_now, weeks_to_status
      )
    )

data_tbl |> glimpse()
Rows: 773,337
Columns: 28
$ policy_id               <chr> "C010000009", "C010000019", "C010000091", "C01…
$ countyname              <chr> "Dublin City", "Kilkenny County", "Cork County…
$ edname                  <chr> "North Dock B", "Kilkenny Rural", "Fermoy Rura…
$ nuts3name               <chr> "Dublin", "South-East (IE)", "South-West (IE)"…
$ sa_id                   <chr> "A268108011", "A097063020", "A047151005", "A02…
$ cluster_id              <chr> "n6_c0", "n6_c0", "n6_c5", "n6_c2", "n6_c0", "…
$ prod_type               <fct> protection, protection, protection, protection…
$ prem_type               <chr> "RP", "RP", "RP", "RP", "RP", "RP", "RP", "RP"…
$ prem_freq               <chr> "12", "12", "12", "12", "12", "12", "12", "12"…
$ prem_ape                <dbl> 4172.34, 1150.17, 313.05, 731.94, 2938.13, 799…
$ prem_risk               <dbl> 2980.2396, 821.5525, 223.6095, 522.8157, 2098.…
$ policy_startdate        <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-0…
$ policy_enddate          <date> 2010-01-02, 2000-01-02, 2010-01-02, 2005-01-0…
$ policy_duration         <int> 20, 10, 20, 15, 15, 15, 20, 5, 5, 15, 20, 20, …
$ mort_rating             <dbl> 200, 100, 200, 100, 200, 100, 100, 200, 150, 1…
$ sum_assured             <dbl> 150000, 450000, 250000, 450000, 500000, 200000…
$ dob_life1               <date> 1937-07-22, 1964-08-19, 1971-02-13, 1965-10-0…
$ gender_life1            <chr> "F", "M", "F", "F", "M", "M", "F", "M", "M", "…
$ smoker_life1            <chr> "S", "N", "S", "N", "S", "N", "N", "S", "Q", "…
$ isjointlife             <lgl> TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, …
$ islifeonly              <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
$ mortgage_status         <chr> "MORTDECR", "MORTDECR", "MORTDECR", "TERM", "T…
$ policy_status           <chr> "lapsed", "lapsed", "lapsed", "lapsed", "lapse…
$ policy_statuschangedate <date> 1998-10-02, 1993-08-02, 1991-10-02, 1990-08-0…
$ lapsed                  <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRU…
$ weeks_to_now            <dbl> 1356.429, 1356.429, 1356.429, 1356.429, 1356.4…
$ weeks_to_status         <dbl> 456.428571, 186.857143, 91.142857, 30.285714, …
$ policy_lifetime         <dbl> 456.428571, 186.857143, 91.142857, 30.285714, …

3 Perform Basic Checks on Data

We now want to look at some very high level checks on the data, and we leverage some of the functionality provided by DataExplorer.

3.1 Create High-Level Visualisations

We first want to look at a visualisation of some high-level summaries of the metadata on this dataset. This gives us a quick view of the categorical and numeric values in the dataset, as well as the proportions of missing values.

Show code
# High-level summary plot of the table (DataExplorer).
plot_intro(
  data_tbl,
  title   = "High Level Table Summary",
  ggtheme = theme_cowplot()
  )

3.2 Check Missing Values

Before we do anything with the data, we first check for missing values in the dataset. In some cases, missing data is coded by a special character rather than as a blank, so we first correct for this.

Show code
### _TEMPLATE_
### ADD CODE TO CORRECT FOR DATA ENCODING HERE

With missing data properly encoded, we now visualise the missing data in a number of different ways.

3.2.1 Univariate Missing Data

Show code
# Missingness per column, banded by the proportion of missing values.
plot_missing(
  data_tbl,
  title   = "Summary of Data Missingness",
  group   = list(Good = 0.05, Acceptable = 0.2, Bad = 0.8, Remove = 1),
  ggtheme = theme_cowplot()
  )

We now want to repeat this plot but only for those columns that have some missing values.

Show code
# Same missingness summary, restricted to columns that have missing values.
plot_missing(
  data_tbl,
  title        = "Summary of Data Missingness (missing variables only)",
  missing_only = TRUE,
  group        = list(Good = 0.05, Acceptable = 0.2, Bad = 0.8, Remove = 1),
  ggtheme      = theme_cowplot()
  )

3.2.2 Multivariate Missing Data

It is useful to get an idea of which combinations of variables tend to have missing values simultaneously. To construct a visualisation for this, we count how often each combination of variables has missing values together and produce a heat map of these combination counts.

Show code
# Maximum number of missingness patterns shown in the heat map.
dataexp_missing_group_count <- 20

# Work on data_tbl, the same table used by the univariate missingness plots,
# so that the proportions refer to the rows actually being explored rather
# than to the unfiltered raw data.
row_count <- data_tbl |> nrow()

# Turn a column into a 0/1 integer flag marking its missing entries.
count_nas <- \(x) x |> are_na() |> vec_cast(integer())

missing_vizdata_tbl <- data_tbl |>
  mutate(across(everything(), count_nas)) %>%
  # Concatenate the per-column flags of each row into one pattern label.
  # The magrittr pipe is kept for this step because pmap_chr() needs the
  # `.` placeholder to receive the whole table.
  mutate(label = pmap_chr(., str_c)) |>
  group_by(label) |>
  mutate(
    miss_count = n(),
    miss_prop  = miss_count / row_count
    ) |>
  # All rows in a group are identical at this point, so keep one per pattern.
  slice_max(order_by = miss_prop, n = 1, with_ties = FALSE) |>
  ungroup() |>
  pivot_longer(
    !c(label, miss_count, miss_prop),
    names_to = "variable_name",
    values_to = "presence"
    ) |>
  mutate(
    prop_label = sprintf("%6.4f", miss_prop)
    )

# Most frequent missingness patterns (up to dataexp_missing_group_count,
# more if there are ties).
top10_data_tbl <- missing_vizdata_tbl |>
  select(label, miss_prop) |>
  distinct() |>
  slice_max(order_by = miss_prop, n = dataexp_missing_group_count)

missing_plot_tbl <- missing_vizdata_tbl |>
  semi_join(top10_data_tbl, by = "label")

# One tile per (pattern, variable); the fill marks whether the variable is
# missing in that pattern.
ggplot(missing_plot_tbl) +
  geom_tile(aes(x = variable_name, y = prop_label, fill = presence), height = 0.8) +
  scale_fill_continuous() +
  scale_x_discrete(position = "top", labels = ~ abbreviate(.x, minlength = 10)) +
  xlab("Variable") +
  ylab("Proportion of Rows") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
    )

This visualisation takes a little explaining.

Each row represents a combination of variables with simultaneous missing values. For each row in the graphic, the coloured entries show which particular variables are missing in that combination. The proportion of rows with that combination is displayed in the label for the row.

3.3 Inspect High-level-count Categorical Variables

With the raw data loaded up we now remove obvious unique or near-unique variables that are not amenable to basic exploration and plotting.

Show code
# Classify the columns by type, then count distinct values per discrete column.
coltype_lst <- data_tbl |> create_coltype_list()

# Number of distinct values in a column (an NA counts as one value).
count_levels <- ~ length(unique(.x))

catvar_valuecount_tbl <- data_tbl |>
  summarise(
    across(coltype_lst$split$discrete, count_levels),
    .groups = "drop"
    ) |>
  pivot_longer(
    cols      = everything(),
    names_to  = "var_name",
    values_to = "level_count"
    ) |>
  arrange(desc(level_count))

catvar_valuecount_tbl |> print()
# A tibble: 13 × 2
   var_name        level_count
   <chr>                 <int>
 1 policy_id            773337
 2 sa_id                 18481
 3 edname                 3109
 4 countyname               34
 5 nuts3name                 8
 6 cluster_id                6
 7 prem_freq                 3
 8 smoker_life1              3
 9 mortgage_status           3
10 policy_status             3
11 gender_life1              2
12 prod_type                 1
13 prem_type                 1
Show code
row_count <- nrow(data_tbl)

# glue() trims a trailing "\n" from its template, so the line break is
# written by cat() itself.
cat(glue("Dataset has {row_count} rows"), "\n", sep = "")
Dataset has 773337 rows

Now that we have a table of the counts of all the categorical variables, we can automatically exclude unique variables from the exploration, as their level count will match the row count.

Show code
# Columns whose number of distinct values equals the number of rows.
unique_vars <- pull(
  filter(catvar_valuecount_tbl, level_count == row_count),
  var_name
  )

unique_vars |> print()
[1] "policy_id"
Show code
# Drop the unique-valued columns. any_of() replaces the superseded one_of()
# and silently ignores names that are not present in the table.
explore_data_tbl <- data_tbl |>
  select(-any_of(unique_vars))

Having removed the unique identifier variables from the dataset, we may also wish to exclude categoricals with high level counts, so we create a vector of those variable names.

Show code
# Non-unique discrete columns whose level count reaches the exclusion threshold.
highcount_vars <- pull(
  filter(
    catvar_valuecount_tbl,
    level_count >= dataexp_level_exclusion_threshold & level_count < row_count
    ),
  var_name
  )

highcount_vars |> str_c(collapse = ", ") |> cat()
sa_id, edname

We now can continue doing some basic exploration of the data. We may also choose to remove some extra columns from the dataset.

Show code
### You may want to comment out these next few lines to customise which
### categoricals are kept in the exploration.
drop_vars <- c(highcount_vars)

if (length(drop_vars) > 0) {
  # any_of() replaces the superseded one_of() and silently ignores names
  # that are not present in the table.
  explore_data_tbl <- explore_data_tbl |>
    select(-any_of(drop_vars))

  cat(str_c(drop_vars, collapse = ", "))
}
sa_id, edname

4 Univariate Data Exploration

Now that we have loaded the data we can prepare it for some basic data exploration.

4.1 Quick Univariate Data Summaries

We use a number of summary visualisations provided by DataExplorer: a facet plot across each variable with categorical variables getting bar plots and numerical variables getting histograms.

We first look at the barplots of categorical variables.

Show code
# Bar plots of the discrete columns, laid out 2 x 2 per page.
data_tbl |>
  plot_bar(
    ncol    = 2,
    nrow    = 2,
    title   = "Barplots of Data",
    ggtheme = theme_cowplot()
    )
7 columns ignored with more than 50 categories.
policy_id: 773337 categories
edname: 3109 categories
sa_id: 18481 categories
policy_startdate: 6710 categories
policy_enddate: 18877 categories
dob_life1: 25130 categories
policy_statuschangedate: 9350 categories

We then have a quick look at histograms of the numeric variables.

Show code
# Histograms of the continuous columns, laid out 2 x 2 per page.
data_tbl |>
  plot_histogram(
    ncol    = 2,
    nrow    = 2,
    title   = "Histograms of Data",
    ggtheme = theme_cowplot()
    )

Finally, we split the remaining variables into different categories and then produce a sequence of plots for each variable.

Show code
# Re-classify the columns by type, this time on the reduced exploration table.
coltype_lst <- explore_data_tbl |> create_coltype_list()

coltype_lst |> print()
$split
$split$continuous
[1] "prem_ape"        "prem_risk"       "policy_duration" "mort_rating"    
[5] "sum_assured"     "weeks_to_now"    "weeks_to_status" "policy_lifetime"

$split$datetime
[1] "policy_startdate"        "policy_enddate"         
[3] "dob_life1"               "policy_statuschangedate"

$split$discrete
 [1] "countyname"      "nuts3name"       "cluster_id"      "prod_type"      
 [5] "prem_type"       "prem_freq"       "gender_life1"    "smoker_life1"   
 [9] "mortgage_status" "policy_status"  

$split$logical
[1] "isjointlife" "islifeonly"  "lapsed"     


$columns
             countyname               nuts3name              cluster_id 
             "discrete"              "discrete"              "discrete" 
              prod_type               prem_type               prem_freq 
             "discrete"              "discrete"              "discrete" 
               prem_ape               prem_risk        policy_startdate 
           "continuous"            "continuous"              "datetime" 
         policy_enddate         policy_duration             mort_rating 
             "datetime"            "continuous"            "continuous" 
            sum_assured               dob_life1            gender_life1 
           "continuous"              "datetime"              "discrete" 
           smoker_life1             isjointlife              islifeonly 
             "discrete"               "logical"               "logical" 
        mortgage_status           policy_status policy_statuschangedate 
             "discrete"              "discrete"              "datetime" 
                 lapsed            weeks_to_now         weeks_to_status 
              "logical"            "continuous"            "continuous" 
        policy_lifetime 
           "continuous" 

4.2 Logical Variables

Logical variables only take two values: TRUE or FALSE. It is useful to see missing data as well though, so we also plot the count of those.

Show code
# One bar plot per logical column; geom_bar() also shows the NA count.
logical_vars <- coltype_lst$split$logical |> sort()

for (plot_varname in logical_vars) {
  cat("--\n")
  # glue() trims a trailing "\n", so the variable name is written with cat()
  # and an explicit line break.
  cat(plot_varname, "\n", sep = "")

  # `[[` extracts the column by name; `.data` inside pull() is deprecated
  # because pull() uses tidyselect.
  na_count <- explore_data_tbl[[plot_varname]] |> are_na() |> sum()

  plot_title <- glue("Barplot of Counts for Variable: {plot_varname} ({na_count} missing values)")

  explore_plot <- ggplot(explore_data_tbl) +
    geom_bar(aes(x = .data[[plot_varname]])) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  plot(explore_plot)
}
--
isjointlife

--
islifeonly

--
lapsed

4.3 Numeric Variables

Numeric variables are usually continuous in nature, though we also have integer data.

Show code
# For each continuous column: print a summary, then draw a histogram of all
# values plus log-scale histograms of the positive and the negative values.
numeric_vars <- coltype_lst$split$continuous |> sort()

for (plot_varname in numeric_vars) {
  cat("--\n")
  # glue() trims a trailing "\n", which made the variable name run into the
  # summary header; write the line break explicitly instead.
  cat(plot_varname, "\n", sep = "")

  # `[[` extracts the column by name; `.data` inside pull() is deprecated
  # because pull() uses tidyselect.
  plot_var <- explore_data_tbl[[plot_varname]]
  na_count <- plot_var |> are_na() |> sum()

  plot_var |> summary() |> print()

  plot_title <- glue("Histogram Plot for Variable: {plot_varname} ({na_count} missing values)")


  # Line thickness is set with `linewidth`; `size` for lines is deprecated
  # since ggplot2 3.4.0.
  all_plot <- ggplot() +
    geom_histogram(aes(x = plot_var), bins = dataexp_hist_bins_count) +
    geom_vline(xintercept = mean(plot_var, na.rm = TRUE),
               colour = "red", linewidth = 1.5) +
    geom_vline(xintercept = median(plot_var, na.rm = TRUE),
               colour = "green", linewidth = 1.5) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_continuous(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(
      plot_title,
      subtitle = "(red line is mean, green line is median)"
      )

  # Strictly positive values only: zeros are not positive and would map to
  # -Inf on the log10 axis.
  pos_data_tbl <- explore_data_tbl |>
    filter(.data[[plot_varname]] > 0) |>
    mutate(var_val = .data[[plot_varname]])

  pos_log_plot <- ggplot(pos_data_tbl) +
    geom_histogram(aes(x = var_val), bins = dataexp_hist_bins_count) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_log10(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle("Positive Values")


  # Negative values are plotted by magnitude so they fit on a log10 axis.
  neg_data_tbl <- explore_data_tbl |>
    filter(.data[[plot_varname]] < 0) |>
    mutate(var_val = abs(.data[[plot_varname]]))

  neg_log_plot <- ggplot(neg_data_tbl) +
    geom_histogram(aes(x = var_val), bins = dataexp_hist_bins_count) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_log10(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle("Negative Values")


  # Top row: overall histogram (second cell left empty); bottom row: the two
  # log-scale histograms.
  plot_grid(
      all_plot,
      NULL,
      pos_log_plot,
      neg_log_plot,
      nrow = 2
      ) |>
    print()
}
--
mort_rating   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  100.0   100.0   150.0   141.5   200.0   300.0 

--
policy_duration   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   10.00   20.00   17.75   20.00   35.00 

--
policy_lifetime     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
   0.1429   91.2857  260.8571  308.1308  484.4286 1043.5714 

--
prem_ape     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
3.637e+01 8.504e+02 1.886e+03 4.907e+03 4.428e+03 1.365e+06 

--
prem_risk     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
    25.98    607.42   1347.48   3504.96   3163.17 975008.71 

--
sum_assured   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 100000  200000  300000  435722  450000 5000000 

--
weeks_to_now     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
   0.1429  464.2857  664.2857  657.5921  875.5714 1356.4286 
--
weeks_to_status   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0     0.0   113.0   197.8   304.4  1043.6 

4.4 Categorical Variables

Categorical variables only have values from a limited, and usually fixed, number of possible values.

Show code
# For each discrete column: a bar plot in natural level order, then a bar
# plot with rare levels lumped together and bars ordered by descending count.
categorical_vars <- coltype_lst$split$discrete |> sort()

for (plot_varname in categorical_vars) {
  cat("--\n")
  # glue() trims a trailing "\n", so the variable name is written with cat()
  # and an explicit line break.
  cat(plot_varname, "\n", sep = "")

  # `[[` extracts the column by name; `.data` inside pull() is deprecated
  # because pull() uses tidyselect.
  na_count <- explore_data_tbl[[plot_varname]] |> are_na() |> sum()

  plot_title <- glue("Barplot of Counts for Variable: {plot_varname} ({na_count} missing values)")

  standard_plot_tbl <- explore_data_tbl |>
    count(.data[[plot_varname]])

  standard_plot <- ggplot(standard_plot_tbl) +
    geom_bar(aes(x = .data[[plot_varname]], weight = n)) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  standard_plot |> print()


  # Levels are not truncated in the data: truncating can give two distinct
  # levels the same name and merge their bars. Long names are shortened for
  # display only, by the abbreviate() labeller on the x scale.
  desc_plot_tbl <- explore_data_tbl[[plot_varname]] |>
    fct_lump(n = dataexp_cat_level_count) |>
    fct_count()

  desc_plot <- ggplot(desc_plot_tbl) +
    geom_bar(aes(x = fct_reorder(f, -n), weight = n)) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(plot_title) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  desc_plot |> print()
}
--
cluster_id

--
countyname

--
gender_life1

--
mortgage_status

--
nuts3name

--
policy_status

--
prem_freq

--
prem_type

--
prod_type

--
smoker_life1

4.5 Date/Time Variables

Date/Time variables represent calendar or time-based data such as the time of day, a date, or a timestamp.

Show code
datetime_vars <- coltype_lst$split$datetime |> sort()

for (plot_varname in datetime_vars) {
  cat("--\n")
  cat(glue("{plot_varname}\n"))

  plot_var <- explore_data_tbl |> pull(.data[[plot_varname]])
  na_count <- plot_var |> are_na() |> sum()

  plot_var |> summary() |> print()

  plot_title <- glue("Barplot of Dates/Times in Variable: {plot_varname} ({na_count} missing values)")


  explore_plot <- ggplot(explore_data_tbl) +
    geom_histogram(aes(x = .data[[plot_varname]]), bins = dataexp_hist_bins_count) +
    xlab(plot_varname) +
    ylab("Count") +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(plot_title)

  plot(explore_plot)
}
--
dob_life1        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1919-06-26" "1955-02-18" "1962-11-23" "1962-11-28" "1970-09-14" "1999-11-09" 

--
policy_enddate        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1995-01-02" "2014-05-10" "2019-10-21" "2021-02-24" "2027-01-08" "2050-12-31" 

--
policy_startdate        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1990-01-02" "1999-03-22" "2003-04-09" "2003-05-25" "2007-02-07" "2015-12-31" 

--
policy_statuschangedate        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"1990-02-02" "2003-05-24" "2007-06-17" "2007-03-10" "2011-06-13" "2015-12-31" 

5 Bivariate Facet Plots

We now move on to looking at bivariate plots of the data set.

A natural way to explore relationships in data is to create univariate visualisations facetted by a categorical value.

Show code
### _TEMPLATE_
### facet_varname <- ''
# Name of the column used to facet every plot in this section.
facet_varname <- "cluster_id"

# NOTE(review): not referenced by any code visible in this section; presumably
# intended as a cap on the number of facets - confirm or remove.
dataexp_facet_count_max <- 3

5.1 Logical Variables

For logical variables we facet on barplots of the levels, comparing TRUE and FALSE; rows with missing values are dropped before plotting.

Show code
# Leave the facet variable itself out of the list of variables to plot.
logical_vars <- sort(logical_vars[!(logical_vars %in% facet_varname)])


for (plot_varname in logical_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows where the plotted variable is missing are removed first.
  plot_tbl <- filter(data_tbl, !are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl, aes(x = .data[[plot_varname]])) +
    geom_bar() +
    facet_wrap(facet_varname, scales = "free") +
    labs(
      x     = plot_varname,
      y     = "Count",
      title = glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")
      ) +
    scale_y_continuous(labels = label_comma()) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(explore_plot)
}
--
isjointlife

--
islifeonly

--
lapsed

5.2 Numeric Variables

For numeric variables, we facet on histograms of the data.

Show code
# One faceted histogram per continuous column, with free axes per facet.
for (plot_varname in numeric_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows where the plotted variable is missing are removed first.
  plot_tbl <- filter(data_tbl, !are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl, aes(x = .data[[plot_varname]])) +
    geom_histogram(bins = dataexp_hist_bins_count) +
    facet_wrap(facet_varname, scales = "free") +
    labs(
      x     = plot_varname,
      y     = "Count",
      title = glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")
      ) +
    scale_x_continuous(labels = label_comma()) +
    scale_y_continuous(labels = label_comma()) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  plot(explore_plot)
}
--
mort_rating

--
policy_duration

--
policy_lifetime

--
prem_ape

--
prem_risk

--
sum_assured

--
weeks_to_now

--
weeks_to_status

5.3 Categorical Variables

We treat categorical variables like logical variables, faceting the barplots of the different levels of the data.

Show code
# Leave the facet variable itself out of the list of variables to plot.
categorical_vars <- categorical_vars[!categorical_vars %in% facet_varname] |> sort()

for (plot_varname in categorical_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows where the plotted variable is missing are removed first.
  # The levels are not truncated in the data: cutting names to 10 characters
  # gives distinct levels with the same first 7 characters one shared name,
  # which merges their counts into a single bar. Long names are shortened for
  # display only, by the abbreviate() labeller on the x scale.
  plot_tbl <- data_tbl |>
    filter(!are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl) +
    geom_bar(aes(x = .data[[plot_varname]])) +
    facet_wrap(facet_varname, scales = "free") +
    xlab(plot_varname) +
    ylab("Count") +
    scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
    scale_y_continuous(labels = label_comma()) +
    ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  plot(explore_plot)
}
--
countyname

--
gender_life1

--
mortgage_status

--
nuts3name

--
policy_status

--
prem_freq

--
prem_type

--
prod_type

--
smoker_life1

5.4 Date/Time Variables

Like the univariate plots, we facet on histograms of the years in the dates.

Show code
# One faceted histogram per date/time column, with free axes per facet.
for (plot_varname in datetime_vars) {
  cat("--\n")
  cat(plot_varname)

  # Rows where the plotted variable is missing are removed first.
  plot_tbl <- filter(data_tbl, !are_na(.data[[plot_varname]]))

  explore_plot <- ggplot(plot_tbl, aes(x = .data[[plot_varname]])) +
    geom_histogram(bins = dataexp_hist_bins_count) +
    facet_wrap(facet_varname, scales = "free") +
    labs(
      x     = plot_varname,
      y     = "Count",
      title = glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")
      ) +
    scale_y_continuous(labels = label_comma()) +
    theme(axis.text.x = element_text(angle = 30, vjust = 0.5))

  print(explore_plot)
}
--
dob_life1

--
policy_enddate

--
policy_startdate

--
policy_statuschangedate